# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627

# BAGGING, BOOSTING, and RANDOM FORESTS

# Import necessary libraries
! pip install pandas;
! pip install numpy;
! pip install scikit-learn;
! pip install matplotlib;
! pip install ISLP;

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt;
from ISLP import load_data

# Load the Auto dataset from package ISLP
from ISLP import load_data
Auto = load_data('Auto')

print(Auto.columns)

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year', 'origin'],
      dtype='object')

# X = Auto.drop(columns=["mpg", "name"])
X = Auto.drop(columns=["mpg"])
y = Auto["mpg"]

# 1. Bagging and Random Forests
# Random forest model with default settings (similar to R's default m = p/3)
rf = RandomForestRegressor()
rf.fit(X, y)

# Model summary
print(f"Mean Squared Error: {mean_squared_error(y, rf.predict(X)):.4f}")
print(f"Variance explained (%): {rf.score(X, y) * 100:.2f}")

Mean Squared Error: 1.0002
Variance explained (%): 98.35

# Feature importance
importance = rf.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': importance}).sort_values(by='Importance', ascending=False)
print(importance_df)

# Plotting feature importance
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.xlabel("Importance")
plt.title("Feature Importance in Random Forest")
plt.show()

        Feature  Importance
1  displacement    0.375632
3        weight    0.205921
0     cylinders    0.140268
2    horsepower    0.121348
5          year    0.120379
4  acceleration    0.030020
6        origin    0.006431

# 2. Cross-validation

# Reset indices for X and y
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

# Randomly split data into training and validation sets
Z = np.random.choice(len(y), 200, replace=False)

# Select training and test sets based on Z
X_train, X_test = X.loc[Z], X.drop(Z)
y_train, y_test = y.loc[Z], y.drop(Z)

# Fit random forest on training subset
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Calculate mean squared error
mse_cv = mean_squared_error(y_test, y_pred)
print(f"Cross-validated MSE: {mse_cv:.4f}")

Cross-validated MSE: 6.1369

# 3. Searching for the optimal solution

reg_tree = RandomForestRegressor(oob_score=True, n_estimators=147, max_features=3, random_state=42)
reg_tree.fit(X_train, y_train)

# Access the OOB score
print("OOB Score:", reg_tree.oob_score_)

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize=(30, 20))
plot_tree(reg_tree.estimators_[0], feature_names=X_train.columns, filled=True, fontsize=10)
plt.show()

OOB Score: 0.828643068370655

# Sample sizes for trees
n_trees = range(1, 300, 10)  # You can adjust this range as needed
oob_scores = []

# Collect OOB scores for different numbers of trees
for n in n_trees:
    reg_tree = RandomForestRegressor(n_estimators=n, oob_score=True, random_state=42)
    reg_tree.fit(X_train, y_train)
    oob_scores.append(reg_tree.oob_score_)

# Plotting OOB error
plt.figure(figsize=(10, 6))
plt.plot(n_trees, oob_scores, marker='o', linestyle='-', color='b')
plt.title('OOB Score vs. Number of Trees')
plt.xlabel('Number of Trees')
plt.ylabel('OOB Score')
plt.grid()
plt.show()

C:\Users\baron\AppData\Local\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:615: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates.
  warn(

# Finding the optimal number of trees to minimize MSE and maximize R²
errors = reg_tree.oob_score_
optimal_trees = np.argmin(errors) + 1
print(f"Optimal number of trees: {optimal_trees}")

# Final random forest with optimal trees
rf_optimal = RandomForestRegressor(max_features=7, n_estimators=optimal_trees)
rf_optimal.fit(X, y)

print(f"Optimal Model MSE: {mean_squared_error(y, rf_optimal.predict(X)):.4f}")
print(f"Optimal Model R²: {rf_optimal.score(X, y) * 100:.2f}")

Optimal number of trees: 1
Optimal Model MSE: 5.9720
Optimal Model R²: 90.17

# 4. Cross-validation loop to find the best m and number of trees
cv_errors = []
n_trees = []

for m in range(1, 8):
    rf_m = RandomForestRegressor(max_features=m, oob_score=True)
    rf_m.fit(X_train, y_train)
    mse_min = np.min(rf_m.oob_score_)
    opt_trees = np.argmin(rf_m.oob_score_) + 1
    cv_errors.append(mse_min)
    n_trees.append(opt_trees)

# Plot CV errors
plt.figure(figsize=(10, 6))
plt.plot(range(1, 8), cv_errors, marker='o')
plt.xlabel("Number of Features (m)")
plt.ylabel("Cross-Validation Error (MSE)")
plt.title("Cross-Validation Error for Different m Values")
plt.show()

# Print results
optimal_m = np.argmin(cv_errors) + 1
print(f"Optimal m: {optimal_m}, with MSE: {cv_errors[optimal_m-1]:.4f}")

# Optimal random forest with best m and number of trees
rf_final = RandomForestRegressor(max_features=optimal_m, n_estimators=n_trees[optimal_m-1])
rf_final.fit(X, y)

print(f"Final Optimal Model MSE: {mean_squared_error(y, rf_final.predict(X)):.4f}")
print(f"Final Optimal Model R²: {rf_final.score(X, y) * 100:.2f}")

# Feature importance for the optimal model
importance_optimal = rf_final.feature_importances_
importance_df_optimal = pd.DataFrame({'Feature': features, 'Importance': importance_optimal}).sort_values(by='Importance', ascending=False)
print(importance_df_optimal)

Optimal m: 7, with MSE: 0.8105
Final Optimal Model MSE: 4.7235
Final Optimal Model R²: 92.23
        Feature  Importance
0     cylinders    0.651802
2    horsepower    0.130907
5          year    0.101565
3        weight    0.066039
4  acceleration    0.031475
1  displacement    0.015715
6        origin    0.002498